# =========================
# Kaggle House Prices
# Strong Baseline (Top Features Only)
# =========================

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# -------------------------
# Load data
# -------------------------
train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# -------------------------
# Remove strong outliers
# -------------------------
# Rows with a very large living area but a low sale price are dropped
# from the training frame only; the test frame is left untouched.
is_outlier = train["GrLivArea"].gt(4000) & train["SalePrice"].lt(300000)
train = train.drop(index=train.index[is_outlier.to_numpy()])

# -------------------------
# Target transform
# -------------------------
# The model is fit on log1p(SalePrice); predictions are mapped back with
# expm1 when the submission is built.
y = np.log1p(train["SalePrice"])
train = train.drop(columns="SalePrice")

# -------------------------
# Select important features
# -------------------------
# Column order is kept fixed: it determines the feature layout seen by
# the model for both the training and the test matrix.
features = [
    # numeric
    "OverallQual",
    "GrLivArea",
    "TotalBsmtSF",
    "GarageCars",
    "GarageArea",
    "YearBuilt",
    "YearRemodAdd",
    # categorical
    "Neighborhood",
    "ExterQual",
    "KitchenQual",
    # numeric
    "MasVnrArea",
]

# Independent copies so later column assignments never write back into
# the raw `train` / `test` frames.
X, X_test = (frame.loc[:, features].copy() for frame in (train, test))

# -------------------------
# Fill missing values
# -------------------------
# Imputation runs BEFORE feature engineering so that derived columns
# (TotalSF, QualitySF) are computed from the filled values instead of
# inheriting NaN from a missing TotalBsmtSF.
zero_fill_cols = ["MasVnrArea", "TotalBsmtSF", "GarageArea", "GarageCars"]
cat_cols = ["Neighborhood", "ExterQual", "KitchenQual"]

for df in [X, X_test]:
    # Missing numeric values are replaced by 0.
    for col in zero_fill_cols:
        df[col] = df[col].fillna(0)
    # Missing categorical values get an explicit "None" level.
    for col in cat_cols:
        df[col] = df[col].fillna("None")

# -------------------------
# Feature engineering
# -------------------------
for df in [X, X_test]:
    # Ages are measured against a fixed reference year of 2010.
    df["HouseAge"] = 2010 - df["YearBuilt"]
    df["RemodAge"] = 2010 - df["YearRemodAdd"]
    # Above-ground living area plus basement area.
    df["TotalSF"] = df["GrLivArea"] + df["TotalBsmtSF"]
    # Overall quality rating scaled by total area.
    df["QualitySF"] = df["OverallQual"] * df["TotalSF"]

# -------------------------
# Categorical encoding
# -------------------------
# The category set is learned from the training frame and reused for the
# test frame, so both frames share one level -> integer-code mapping.
# A level that appears only in the test frame becomes NaN (missing).
for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype(X[c].dtype)

# -------------------------
# LightGBM parameters
# -------------------------
# Regression objective evaluated with RMSE; row and column subsampling
# are both enabled, and the seed is fixed.
params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=0.03,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    seed=42,
    verbosity=-1,
)

# -------------------------
# Cross-validation
# -------------------------
# 5-fold CV: each fold trains with early stopping on its held-out part,
# writes out-of-fold predictions into `oof`, and contributes 1/n_splits
# of its test prediction to `preds` (a fold average).
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X))
preds = np.zeros(len(X_test))

for train_idx, valid_idx in kf.split(X):
    # Positional indexing: X / y keep the gaps left by the outlier drop,
    # so label-based indexing with the fold indices would be wrong.
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=cat_cols)
    # The validation set is tied to the training set via `reference`.
    lgb_valid = lgb.Dataset(
        X_val,
        y_val,
        categorical_feature=cat_cols,
        reference=lgb_train,
    )

    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=3000,
        valid_sets=[lgb_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=0),
        ],
    )

    # Predict with the best iteration found by early stopping.
    oof[valid_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    preds += model.predict(X_test, num_iteration=model.best_iteration) / kf.n_splits

# RMSE on the log1p scale. The square root is taken explicitly because
# the `squared=` keyword of mean_squared_error was deprecated and then
# removed from scikit-learn.
rmse = np.sqrt(mean_squared_error(y, oof))
print(f"CV RMSE: {rmse:.5f}")

# -------------------------
# Submission
# -------------------------
# Predictions are on the log1p scale, so expm1 maps them back to prices
# before they are written next to the test ids.
sale_price = np.expm1(preds)
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": sale_price})

submission.to_csv("submission.csv", index=False)
print("submission.csv saved")
